library(cjoint)
library(dplyr)
library(data.table)
library(ggplot2)
library(stringr)

query_map_object <- function(x, keys_to_query, map_obj) {
  # Translates a single name through a lookup table.
  #
  # x:             the name to translate
  # keys_to_query: names that are eligible for translation
  # map_obj:       named list (or vector) holding the replacement of each key
  #
  # Returns map_obj[[x]] when x is one of keys_to_query, otherwise x itself.
  is_key <- x %in% keys_to_query
  ifelse(is_key, map_obj[[x]], x)
}

prepare_conjoint_format <- function(initial_data_file,
                                    colname_map,
                                    attribute_map){
  # Reshapes a raw Qualtrics export into long conjoint format, with one row per
  # respondent, task and candidate profile.
  #
  # Args:
  #   initial_data_file: data frame holding the Qualtrics CSV; its second row
  #     is parsed to obtain the internal ID of every column.
  #   colname_map: named list whose names are the covariate columns to keep
  #     (after the ID renaming below) and whose values are the names those
  #     columns get in the output.
  #   attribute_map: named list whose names are the conjoint attribute columns
  #     that must be non-empty for a row to be kept.
  #
  # The outcome question IDs are not arguments: the vectors first_outcome,
  # second_outcome, third_outcome and fourth_outcome are looked up outside the
  # function and must hold one entry per task, in task order.
  #
  # Returns: a plain data.frame with the profile attributes, the 0/1 outcome
  # indicators (the choice outcome is named vote_choice), the respondent
  # covariates, task_number and first_candidate (1 = first profile of a pair).

  # Work on the `colname_map` argument itself, so that the result does not
  # depend on a global object that happens to be called `colnames_map`.
  covariate_map <- colname_map
  covariate_map[["duration"]] <- "duration"  # duration is kept under its own name
  covars <- names(covariate_map)

  # Replaces question IDs with the Qualtrics internal IDs. Internal IDs are
  # assigned at question creation and do not change when new questions are
  # added. The ID is whatever follows the first ":" in row 2, once quotes and
  # closing braces are dropped (presumably cells look like {"ImportId":"QID12"}
  # -- TODO confirm against the export).
  var_ids <- unname(unlist(initial_data_file[2, ]))
  id_names <- stringr::str_split_fixed(var_ids, pattern = ":", n = 2)[, 2]
  id_names <- stringr::str_remove_all(id_names, "[\"}]")
  id_names <- stringr::str_replace(id_names, pattern = "QID", replacement = "Q")
  id_names <- stringr::str_replace(id_names, pattern = "_recordId", replacement = "ResponseId")
  colnames(initial_data_file) <- id_names

  # Drops the two leading metadata rows (a negative index stays correct even
  # when fewer than three rows are present), numbers the respondents and keeps
  # only rows whose `finished` flag is not "0".
  responses <- initial_data_file %>%
    dplyr::slice(-(1:2)) %>%
    dplyr::mutate(respondent = dplyr::row_number()) %>%
    dplyr::filter(finished != "0")

  # Respondent-level covariates, joined back onto every profile further down.
  variables <- responses %>%
    dplyr::select(ResponseId, dplyr::all_of(covars))

  conjoint_variables <- responses %>%
    dplyr::select(
      ResponseId,
      dplyr::all_of(first_outcome),
      dplyr::all_of(second_outcome),
      dplyr::all_of(third_outcome),
      dplyr::all_of(fourth_outcome),
      dplyr::matches("F-[1-9]-[1-9]-"))

  # Candidates were shown in pairs during the conjoint experiment. This helper
  # picks the attribute columns of one profile (1 or 2) of one task, followed
  # by that task's four outcome columns and the respondent ID.
  select_candidate <- function(task, profile) {
    conjoint_variables %>%
      dplyr::select(
        dplyr::matches(paste0("F-", task, "-", profile, "-[1-9]")),
        dplyr::all_of(first_outcome[task]),
        dplyr::all_of(second_outcome[task]),
        dplyr::all_of(third_outcome[task]),
        dplyr::all_of(fourth_outcome[task]),
        ResponseId)
  }

  # One task per entry of the outcome vectors.
  task_ids <- seq_along(first_outcome)
  first_candidates <- lapply(task_ids, select_candidate, profile = 1)
  second_candidates <- lapply(task_ids, select_candidate, profile = 2)

  # Positional names for the columns returned by select_candidate().
  attr_names <- c(
    "university",
    "high_school",
    "job",
    "time_on_job",
    "genshoku",
    "father_job",
    "policy_position",
    "last_name",
    "gender",
    "selected",
    "good_me",
    "good_region",
    "good_country",
    "ResponseId")

  first_candidates <- lapply(first_candidates, setNames, attr_names)
  second_candidates <- lapply(second_candidates, setNames, attr_names)

  outcome_vars <- c("selected", "good_region", "good_country", "good_me")

  # Turns the raw answers into 0/1 indicators for the profile in question.
  first_candidates <- lapply(first_candidates, process_outcome_first_cand, outcome_vars)
  second_candidates <- lapply(second_candidates, process_outcome_second_cand, outcome_vars)

  # Joins the respondent covariates and records the position of each task, so
  # that the order in which tasks were presented is known.
  add_respondent_data <- function(candidates) {
    lapply(seq_along(candidates), function(task_id) {
      candidates[[task_id]] %>%
        dplyr::left_join(variables, by = "ResponseId") %>%
        dplyr::mutate(task_number = task_id)
    })
  }

  first_candidates_df <- data.table::rbindlist(add_respondent_data(first_candidates)) %>%
    dplyr::mutate(first_candidate = 1)
  second_candidates_df <- data.table::rbindlist(add_respondent_data(second_candidates)) %>%
    dplyr::mutate(first_candidate = 0)

  formatted_data <- rbind.data.frame(first_candidates_df, second_candidates_df)
  formatted_data <- as.data.frame(formatted_data)

  formatted_data <- formatted_data %>%
    dplyr::rename(vote_choice = selected)

  # Removes rows in which any conjoint attribute is empty or missing, i.e.
  # profiles the respondent was not shown.
  attr_cols <- names(attribute_map)
  attr_values <- as.matrix(formatted_data[, attr_cols, drop = FALSE])
  has_all_attributes <- rowSums(is.na(attr_values) | attr_values == "") == 0

  formatted_data <- formatted_data %>%
    dplyr::filter(has_all_attributes)

  # Gives the covariate columns their readable names.
  colnames(formatted_data) <- vapply(
    colnames(formatted_data),
    function(column) as.character(query_map_object(column, covars, covariate_map)),
    character(1),
    USE.NAMES = FALSE)

  return(formatted_data)
}


generate_new_variables <- function(survey_data){
  # Adds indicator variables on the basis of more detailed survey answers.
  #
  # Args:
  #   survey_data: data frame with the columns city_size, cur_pref, young_pref,
  #     resp_gender, concern_ikkyokushuchu, concern_shoshika, birth_year and
  #     father_job.
  #
  # Returns: survey_data with the factor columns urbanZ_alt, urban, male_resp,
  # ikkyoku_high, shoshika_high, older, seshu and moved_prefecture appended.

  required_cols <- c("city_size", "cur_pref", "young_pref", "resp_gender",
                     "concern_ikkyokushuchu", "concern_shoshika", "birth_year",
                     "father_job")
  missing_cols <- setdiff(required_cols, names(survey_data))
  if (length(missing_cols) > 0) {
    stop("survey_data is missing column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # 0/1 indicator stored as a factor.
  as_indicator <- function(condition) as.factor(as.numeric(condition))

  # Removes the piping markup ("e://Field/", "$", "{", "}", newlines) so that
  # raw values and values that were already cleaned compare equal.
  strip_markup <- function(text) {
    gsub("[${}\n]", "", gsub("e://Field/", "", text, fixed = TRUE))
  }

  # Codes singled out for urbanZ_alt (presumably prefecture codes of the
  # largest metropolitan areas -- confirm against the codebook).
  urban_pref_codes <- c(1, 13, 23, 27, 40)
  seshu_jobs <- c("${outside_prefecture}選出の国会議員", "${respondent_prefecture}選出の国会議員")

  city_size <- survey_data[["city_size"]]
  cur_pref <- survey_data[["cur_pref"]]

  survey_data[["urbanZ_alt"]] <- as_indicator((city_size == 3) & cur_pref %in% urban_pref_codes)
  survey_data[["urban"]] <- as_indicator(city_size == 3)
  survey_data[["male_resp"]] <- as_indicator(survey_data[["resp_gender"]] == 0)
  survey_data[["ikkyoku_high"]] <- as_indicator(
    as.numeric(as.character(survey_data[["concern_ikkyokushuchu"]])) > 5)
  survey_data[["shoshika_high"]] <- as_indicator(
    as.numeric(as.character(survey_data[["concern_shoshika"]])) > 6)

  # NOTE(review): unlike the other indicators this one has the levels
  # FALSE/TRUE rather than 0/1. Left unchanged because code reading the saved
  # data may rely on those levels.
  survey_data[["older"]] <- as.factor(as.numeric(survey_data[["birth_year"]]) <= 1976)

  # The comparison is done on markup-free text: once "$", "{" and "}" have been
  # stripped from father_job, the raw "${...}" literals can no longer match.
  survey_data[["seshu"]] <- as_indicator(
    strip_markup(survey_data[["father_job"]]) %in% strip_markup(seshu_jobs))

  # Stored on survey_data itself so that the column is part of the result.
  survey_data[["moved_prefecture"]] <- as_indicator(cur_pref != survey_data[["young_pref"]])

  return(survey_data)
}


fix_variable_format <- function(survey_data, colnames_map){
  # Stores the coded covariates cur_pref, city_size and party_id as numeric.
  #
  # Args:
  #   survey_data: data frame containing those three columns.
  #   colnames_map: not used; kept so that existing calls keep working.
  #
  # Returns: survey_data with the three columns converted to numeric.
  #
  # Outcome variables are handled separately (make_outcome_vars_numeric)
  # because their type gets modified back to string by modify_variable_text.

  # Factors are converted through their labels: as.numeric() on a factor
  # returns the internal level codes, not the recorded values.
  to_numeric <- function(values) {
    if (is.factor(values)) as.numeric(as.character(values)) else as.numeric(values)
  }

  for (column in c("cur_pref", "city_size", "party_id")) {
    survey_data[[column]] <- to_numeric(survey_data[[column]])
  }

  return(survey_data)

}

modify_variable_text <- function(survey_data){
  # Cleans the piping markup out of the survey values.
  #
  # Step 1 removes the first "e://Field/" found in each value of father_job,
  # high_school, university and job. Step 2 deletes every "$", "{", "}" and
  # newline character from every column; because it goes through gsub(), all
  # columns of the result are character vectors.
  embedded_field_cols <- c("father_job", "high_school", "university", "job")
  drop_field_prefix <- function(values) stringr::str_remove(values, "e://Field/")
  strip_markup_chars <- function(values) gsub("[${}\n]", "", values)

  cleaned <- dplyr::mutate(
    survey_data,
    dplyr::across(dplyr::all_of(embedded_field_cols), drop_field_prefix))

  cleaned[] <- lapply(cleaned, strip_markup_chars)
  return(cleaned)
}

change_conjoint_feature_name <- function(survey_data, attribute_name_map){
  # Renames the conjoint feature columns to their more legible names.
  #
  # attribute_name_map is a named list: each name is a current column name and
  # each value the name that replaces it. Columns that are not among the names
  # of the map keep their current name.
  renamable <- names(attribute_name_map)
  current_names <- colnames(survey_data)

  legible_names <- sapply(current_names, query_map_object, renamable, attribute_name_map)
  colnames(survey_data) <- legible_names

  survey_data
}


change_conjoint_level_values <- function(survey_data, map_object){
  # Translates the conjoint level values from Japanese to English.
  #
  # Args:
  #   survey_data: data frame whose cells may hold values to translate.
  #   map_object: named list; each name is a value to look for and each
  #     element is the value that replaces it.
  #
  # Returns: survey_data in which every cell equal to a name of map_object has
  # been replaced. Entries are applied in the order of map_object, so a value
  # written by an earlier entry can be replaced again by a later one. An empty
  # map returns survey_data unchanged.
  old_values <- names(map_object)

  # seq_along() skips the loop for an empty map, whereas 1:length() would
  # iterate over c(1, 0) and fail.
  for (i in seq_along(map_object)) {
    is_old_value <- survey_data == old_values[[i]]
    survey_data <- replace(survey_data, is_old_value, map_object[[i]])
  }

  return(survey_data)
}

make_conjoint_vars_factors <- function(survey_data, conjoint_var_names){
  # Converts the conjoint attribute columns to factors.
  #
  # Args:
  #   survey_data: data frame containing the columns to convert.
  #   conjoint_var_names: vector of strings naming those columns.
  #
  # Returns: survey_data with the named columns stored as factors.

  # List-style indexing always yields a data frame. With `[, names]` a single
  # name would drop to a plain vector and lapply() would then iterate over
  # its cells instead of over columns.
  survey_data[conjoint_var_names] <- lapply(survey_data[conjoint_var_names], as.factor)

  return(survey_data)
}

make_outcome_vars_numeric <- function(survey_data){
  # Makes the outcome variables numeric.
  #
  # Args:
  #   survey_data: data frame with the columns good_me, good_country,
  #     good_region and vote_choice.
  #
  # Returns: survey_data with those four columns stored as numeric.

  outcome_vars <- c("good_me", "good_country", "good_region", "vote_choice")

  # Factors go through their labels, since as.numeric() on a factor returns
  # the internal level codes.
  to_numeric <- function(values) {
    if (is.factor(values)) as.numeric(as.character(values)) else as.numeric(values)
  }

  # lapply() returns one element per column whatever the number of rows;
  # sapply() changes its return shape with the input size.
  survey_data[outcome_vars] <- lapply(survey_data[outcome_vars], to_numeric)

  return(survey_data)
}


process_outcome_first_cand <- function(first_cand_df, outcomes) {
  # Recodes the outcome columns of a first-profile data frame as 0/1
  # indicators: 1 where the recorded answer equals 1, 0 otherwise.
  #
  # outcomes: names of the columns to recode.
  answered_one <- function(answer) as.numeric(answer == 1)

  dplyr::mutate(
    first_cand_df,
    dplyr::across(dplyr::all_of(outcomes), answered_one))
}

process_outcome_second_cand <- function(second_cand_df, outcomes) {
  # Recodes the outcome columns of a second-profile data frame as 0/1
  # indicators: 1 where the recorded answer equals 2, 0 otherwise.
  #
  # outcomes: names of the columns to recode.
  answered_two <- function(answer) as.numeric(answer == 2)

  dplyr::mutate(
    second_cand_df,
    dplyr::across(dplyr::all_of(outcomes), answered_two))
}


# Location of the raw survey export (relative to the working directory).
path_to_data <- 'data/main_survey_final.csv'

# YAML lookup tables: covariate column names, conjoint attribute names and
# the labels that replace the conjoint level values.
path_to_colnames <- 'mappings/colnames_mapper.yaml'
path_to_attribute_rename <- 'mappings/attribute_mapper.yaml'
path_to_plot_labels <- 'mappings/plot_label_mapper.yaml'

# Where the formatted data set is saved.
output_file <- "./data/first_survey_formatted.rds"

# Column IDs of the four outcome questions, one entry per conjoint task.
# prepare_conjoint_format() reads these vectors as globals and indexes them by
# task position, so they must all list the tasks in the same order. After
# renaming they become selected, good_me, good_region and good_country.
first_outcome <- c("Q26",  "Q167", "Q172", "Q177", "Q182", "Q186", "Q189", "Q213")
second_outcome <- c("Q108_1", "Q169_1", "Q174_1", "Q179_1", "Q184_1", "Q187_1", "Q190_1", "Q214_1")
third_outcome <- c("Q108_2", "Q169_2", "Q174_2", "Q179_2", "Q184_2", "Q187_2", "Q190_2", "Q214_2")
fourth_outcome <- c("Q108_3", "Q169_3", "Q174_3", "Q179_3", "Q184_3", "Q187_3", "Q190_3", "Q214_3")


colnames_map <- yaml::read_yaml(path_to_colnames)
attribute_rename <- yaml::read_yaml(path_to_attribute_rename)
plot_label_map <- yaml::read_yaml(path_to_plot_labels)

initial_file <- read.csv(path_to_data)

# The following lines read in the Qualtrics file and prepare the proper
# data format for the conjoint experiment. Each step takes the result of the
# previous one, so the order of the calls matters.

# Long format: one row per respondent, task and candidate profile.
formatted_data <- prepare_conjoint_format(initial_file, colnames_map, attribute_rename)
# Numeric coding for cur_pref, city_size and party_id.
# NOTE(review): the next step passes every column through gsub(), which
# returns character vectors -- confirm that this conversion is still needed.
formatted_data <- fix_variable_format(formatted_data, colnames_map)
# Strips the "e://Field/" prefix and the "$", "{", "}" and newline characters.
formatted_data <- modify_variable_text(formatted_data)
# Derived indicator variables (urban, male_resp, older, seshu, ...).
formatted_data <- generate_new_variables(formatted_data)
# Legible names for the conjoint attribute columns.
formatted_data <- change_conjoint_feature_name(formatted_data, attribute_rename)
# Replaces the conjoint level values with the labels from plot_label_map.
formatted_data <- change_conjoint_level_values(formatted_data, plot_label_map)
# The renamed attribute columns become factors, the outcomes numeric.
formatted_data <- make_conjoint_vars_factors(formatted_data, unname(unlist(attribute_rename)))
formatted_data <- make_outcome_vars_numeric(formatted_data)

saveRDS(formatted_data, file=output_file)





